package data_deepprocessing.algorithm.cvalue_ncvalue_tfidf.util;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import data_deepprocessing.algorithm.cvalue_ncvalue_tfidf.main.NCValue_Compute_Main;
import uk.ac.shef.dcs.oak.jate.core.algorithm.NCValueAlgorithm;
import uk.ac.shef.dcs.oak.jate.core.context.ContextExtraction;
import uk.ac.shef.dcs.oak.jate.util.control.StopList;



public class NCValueAlgorithmYYH extends NCValueAlgorithm{
	
	static Logger logger = Logger.getLogger(NCValueAlgorithmYYH.class.getName());

	/**
	 * Creates the algorithm with a context extractor and a stop list.
	 * The superclass lemmatizer argument is deliberately {@code null}:
	 * lemmatization is disabled in this variant (see the removed
	 * lemma-handling code paths in the original implementation).
	 *
	 * @param contextExtract extractor used to pull context words from sentences
	 * @param stoplist stop-word list forwarded to the superclass
	 */
	public NCValueAlgorithmYYH(ContextExtraction contextExtract, StopList stoplist) {
		super(contextExtract, stoplist ,null);
	}
	
	/**
	 * Identifies, for every C-value candidate term, the context words
	 * (adjectives, nouns or verbs) that co-occur with it in the corpus
	 * sentences, and accumulates two inherited structures:
	 * {@code Term_CW_Map} (term -&gt; set of its context words) and
	 * {@code Term_CW_freqMap} ("term+contextWord" -&gt; total co-occurrence
	 * frequency across all sentences).
	 *
	 * @throws IOException propagated from sentence retrieval / extraction
	 */
	protected void ContextIdentification_Terms() throws IOException{
			List<String> sentences = NCValue_Compute_Main.getOrderedSentences();
			for (String sentence: sentences) {

				// A single sentence contains only a few term variants, so the
				// helper returns just the variants actually present in it.
				Set<String> sent_CValueTermVariants = new HashSet<String>(
						UtilityYYH.getTermVariants_sent(sentence, CValueTerms_Variants));

				if(sent_CValueTermVariants.isEmpty()){
					continue;
				}

				Set<String> ContextWords =
						contextExtraction.ExtractContextWords(sentence, sent_CValueTermVariants);

				// Map<contextWord, frequency of that word in this sentence>
				Map<String,Integer> freqMap = CalculateFrequency(sentence, ContextWords);

				for(String term : sent_CValueTermVariants){
					for(Map.Entry<String,Integer> entry : freqMap.entrySet()){
						String contextWord = entry.getKey();

						// Record that this context word co-occurs with this term.
						Set<String> contextWordSet = Term_CW_Map.get(term);
						if(contextWordSet == null){
							contextWordSet = new HashSet<String>();
							Term_CW_Map.put(term, contextWordSet);
						}
						contextWordSet.add(contextWord);

						// Accumulate the (term, context word) pair frequency.
						String pairKey = term + "+" + contextWord;
						Integer previous = Term_CW_freqMap.get(pairKey);
						if(previous == null){
							Term_CW_freqMap.put(pairKey, entry.getValue());
						}else{
							Term_CW_freqMap.put(pairKey, previous + entry.getValue());
						}
					}
				}
			}
			logger.info("Term_CW_Map");
			for(String term : Term_CW_Map.keySet()){
				logger.info(term + "->_" +Term_CW_Map.get(term));
			}
			logger.info("Term_CW_freqMap");
			for(String term : Term_CW_freqMap.keySet()){
				logger.info(term + "->_" +Term_CW_freqMap.get(term));
			}
	}
	
	/**
	 * Counts how many times each context word occurs in a sentence.
	 *
	 * Each word is matched as a literal string via {@link Pattern#quote}:
	 * compiling the raw word as a regex (as the previous version did) threw
	 * {@code PatternSyntaxException} for words containing metacharacters
	 * (e.g. "C++") and silently mis-counted words containing "." or "?".
	 *
	 * @param sent the sentence to scan
	 * @param contextWords context words to count; may be empty
	 * @return map from each context word to its occurrence count in the sentence
	 */
	protected  Map<String, Integer> CalculateFrequency(String sent, Set<String> contextWords) {
		Map<String,Integer> freqMap = new HashMap<String,Integer>();

		if(contextWords.isEmpty()){
			return freqMap;
		}
		
		for(String word: contextWords){
			int count = 0;

			// Quote the word so it is matched literally, not as a regex.
			Matcher matcher = Pattern.compile(Pattern.quote(word)).matcher(sent);

			while(matcher.find()){
				++count;
			}

			// contextWords is a Set, so each word is processed exactly once;
			// no need to merge with a previous entry.
			freqMap.put(word, count);
		}		
		
		logger.info("freqMap ");
		for(String word: freqMap.keySet()){
			logger.info(word + " : " + freqMap.get(word));
		}
		return freqMap;
	}

}
